{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入相关的包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import stats\n",
    "import pymysql\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "plt.rcParams['font.family']='SimHei'\n",
    "plt.rcParams['axes.unicode_minus']= False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取数据\n",
    "data=pd.read_csv('ab_data.csv',sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>group</th>\n",
       "      <th>landing_page</th>\n",
       "      <th>converted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>851104</td>\n",
       "      <td>2017-01-21 22:11:48.556739</td>\n",
       "      <td>control</td>\n",
       "      <td>old_page</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>804228</td>\n",
       "      <td>2017-01-12 08:01:45.159739</td>\n",
       "      <td>control</td>\n",
       "      <td>old_page</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>661590</td>\n",
       "      <td>2017-01-11 16:55:06.154213</td>\n",
       "      <td>treatment</td>\n",
       "      <td>new_page</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>853541</td>\n",
       "      <td>2017-01-08 18:28:03.143765</td>\n",
       "      <td>treatment</td>\n",
       "      <td>new_page</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>864975</td>\n",
       "      <td>2017-01-21 01:52:26.210827</td>\n",
       "      <td>control</td>\n",
       "      <td>old_page</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id                   timestamp      group landing_page  converted\n",
       "0   851104  2017-01-21 22:11:48.556739    control     old_page          0\n",
       "1   804228  2017-01-12 08:01:45.159739    control     old_page          0\n",
       "2   661590  2017-01-11 16:55:06.154213  treatment     new_page          0\n",
       "3   853541  2017-01-08 18:28:03.143765  treatment     new_page          0\n",
       "4   864975  2017-01-21 01:52:26.210827    control     old_page          1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"date\"] = data.timestamp.str[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>group</th>\n",
       "      <th>landing_page</th>\n",
       "      <th>converted</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>851104</td>\n",
       "      <td>2017-01-21 22:11:48.556739</td>\n",
       "      <td>control</td>\n",
       "      <td>old_page</td>\n",
       "      <td>0</td>\n",
       "      <td>2017-01-21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>804228</td>\n",
       "      <td>2017-01-12 08:01:45.159739</td>\n",
       "      <td>control</td>\n",
       "      <td>old_page</td>\n",
       "      <td>0</td>\n",
       "      <td>2017-01-12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>661590</td>\n",
       "      <td>2017-01-11 16:55:06.154213</td>\n",
       "      <td>treatment</td>\n",
       "      <td>new_page</td>\n",
       "      <td>0</td>\n",
       "      <td>2017-01-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>853541</td>\n",
       "      <td>2017-01-08 18:28:03.143765</td>\n",
       "      <td>treatment</td>\n",
       "      <td>new_page</td>\n",
       "      <td>0</td>\n",
       "      <td>2017-01-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>864975</td>\n",
       "      <td>2017-01-21 01:52:26.210827</td>\n",
       "      <td>control</td>\n",
       "      <td>old_page</td>\n",
       "      <td>1</td>\n",
       "      <td>2017-01-21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id                   timestamp      group landing_page  converted  \\\n",
       "0   851104  2017-01-21 22:11:48.556739    control     old_page          0   \n",
       "1   804228  2017-01-12 08:01:45.159739    control     old_page          0   \n",
       "2   661590  2017-01-11 16:55:06.154213  treatment     new_page          0   \n",
       "3   853541  2017-01-08 18:28:03.143765  treatment     new_page          0   \n",
       "4   864975  2017-01-21 01:52:26.210827    control     old_page          1   \n",
       "\n",
       "         date  \n",
       "0  2017-01-21  \n",
       "1  2017-01-12  \n",
       "2  2017-01-11  \n",
       "3  2017-01-08  \n",
       "4  2017-01-21  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 根据常规默认值确定α ,β,K 值 α = 0.05, β = 0.2 ,K = 1 (组件样本均衡)\n",
    "alpha = 0.05\n",
    "beta = 0.2\n",
    "k =1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "group      landing_page\n",
       "control    new_page          1928\n",
       "           old_page        145274\n",
       "treatment  new_page        145311\n",
       "           old_page          1965\n",
       "Name: user_id, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看样本量是否充足\n",
    "data.groupby(['group','landing_page'])['user_id'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "date\n",
       "2017-01-02     5783\n",
       "2017-01-03    13394\n",
       "2017-01-04    13284\n",
       "2017-01-05    13124\n",
       "2017-01-06    13528\n",
       "2017-01-07    13381\n",
       "2017-01-08    13564\n",
       "2017-01-09    13439\n",
       "2017-01-10    13523\n",
       "2017-01-11    13553\n",
       "2017-01-12    13322\n",
       "2017-01-13    13238\n",
       "2017-01-14    13329\n",
       "2017-01-15    13449\n",
       "2017-01-16    13327\n",
       "2017-01-17    13322\n",
       "2017-01-18    13285\n",
       "2017-01-19    13293\n",
       "2017-01-20    13393\n",
       "2017-01-21    13475\n",
       "2017-01-22    13423\n",
       "2017-01-23    13511\n",
       "2017-01-24     7538\n",
       "Name: user_id, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#每天的用户量统计\n",
    "data.groupby('date')['user_id'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.1203863045004612 0.11880724790277405\n"
     ]
    }
   ],
   "source": [
    "#控制组用户点击率\n",
    "p_a= data[(data['group']=='control') &(data['landing_page']=='old_page')]['converted'].mean()\n",
    "#实验组用户点击率\n",
    "p_b= data[(data['group']=='treatment') &(data['landing_page']=='new_page')]['converted'].mean()\n",
    "print(p_a,p_b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-0.0015790565976871451"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#求统计量值\n",
    "tongjiliang=p_b-p_a\n",
    "tongjiliang"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6590 6618\n"
     ]
    }
   ],
   "source": [
    "#控制组用户用户数\n",
    "n1= data[(data['date']=='2017-01-03')&(data['group']=='control') &(data['landing_page']=='old_page')]['converted'].size\n",
    "#实验组用户用户数\n",
    "n2= data[(data['date']=='2017-01-03')&(data['group']=='treatment') &(data['landing_page']=='new_page')]['converted'].size\n",
    "print(n1,n2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.188810132229876e-05"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#求方差\n",
    "var = p_b * (1-p_b)/n2  + p_a * (1-p_a)/n1\n",
    "var"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.610119207871177"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tongjiliang_p = 1-stats.norm.cdf(tongjiliang,0,np.sqrt(var))\n",
    "tongjiliang_p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "P值大于显著性水平α，接受原假设实验组，说明实验组点击率 < 对照组\n"
     ]
    }
   ],
   "source": [
    "if(tongjiliang_p > alpha):\n",
    "    print(\"P值大于显著性水平α，接受原假设实验组，说明实验组点击率 < 对照组\")\n",
    "else:\n",
    "    print(\"P值小于显著性水平α，接受备择假设，说明实验组点击率 > 对照组\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ABTest封装函数\n",
    "def abtest(df: pd.DataFrame, alpha=0.05, group_col: str = None, value_col: str =None):\n",
    "    '''\n",
    "        :param df: 被分析DateFrame对象\n",
    "        :param alpha: 临界值\n",
    "        :param group_col: 组列的名字，默认为df的第一列\n",
    "        :param value_col: 值列的名字,默认为df的第2列\n",
    "        :return: tongjiliang, pvalue, ptype\n",
    "    '''\n",
    "    # 列名\n",
    "    if not group_col:\n",
    "        group_col = df.columns[0]\n",
    "    if not value_col:\n",
    "        value_col = df.columns[1]\n",
    "    mean= df.groupby(group_col)[value_col].mean()\n",
    "    num = df.groupby(group_col)[value_col].count() \n",
    "    # 求统计量\n",
    "    tongjiliang=mean[0]-mean[1]\n",
    "    #求方差\n",
    "    var = mean[0] * (1-mean[0])/num[1]  + mean[1] * (1-mean[1])/num[1]\n",
    "    # 初始化返回数据\n",
    "    pdf = pd.DataFrame(columns=['tongjiliang', 'pvalue', 'ptype'])\n",
    "    # 计算差异性\n",
    "    pvalue =1-stats.norm.cdf(tongjiliang,0,np.sqrt(var))\n",
    "    if pvalue >= alpha:\n",
    "        ptype = \"无显著差异\"\n",
    "    else:\n",
    "        ptype = \"有显著差异\"\n",
    "    # 添加数据\n",
    "    pdf.loc[pdf.shape[0]] = {'tongjiliang':tongjiliang, 'pvalue': pvalue, 'ptype': ptype}\n",
    "    return pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>group</th>\n",
       "      <th>converted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>control</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>control</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>treatment</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>treatment</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>control</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       group  converted\n",
       "0    control          0\n",
       "1    control          0\n",
       "2  treatment          0\n",
       "3  treatment          0\n",
       "4    control          1"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df=data[['group','converted']]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tongjiliang</th>\n",
       "      <th>pvalue</th>\n",
       "      <th>ptype</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00148</td>\n",
       "      <td>0.108029</td>\n",
       "      <td>无显著差异</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   tongjiliang    pvalue  ptype\n",
       "0      0.00148  0.108029  无显著差异"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "abtest(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "实验结论：\n",
    "接受原假设，说明新的广告banner没有显著性提升"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
